sessionInfo()
## R version 4.3.2 (2023-10-31)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Sonoma 14.1
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/Los_Angeles
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods  
## [7] base     
## 
## other attached packages:
## [1] knitr_1.45
## 
## loaded via a namespace (and not attached):
##  [1] digest_0.6.33     R6_2.5.1          fastmap_1.1.1    
##  [4] xfun_0.41         cachem_1.0.8      htmltools_0.5.7  
##  [7] rmarkdown_2.25    lifecycle_1.0.4   cli_3.6.2        
## [10] sass_0.4.8        jquerylib_0.1.4   compiler_4.3.2   
## [13] rstudioapi_0.15.0 tools_4.3.2       evaluate_0.23    
## [16] bslib_0.6.1       yaml_2.3.8        formatR_1.14     
## [19] rlang_1.1.2       jsonlite_1.8.8
getwd()
## [1] "/Users/heerpatel/Desktop/MGSC-310"

library("tidyverse")
library("readr")
library("dplyr")
library("ggplot2")
library("ggthemes")
library("ggrepel")
library("forcats")
library("formatR")
library("rsample")
library("purrr")
library("plotROC")
library("glmnet")
library("glmnetUtils")
library("yardstick")
library("corrplot")
library("tidymodels")
library(sjPlot)

Data Handling/Cleaning

Adding the dataset


# Load the combined Spotify/YouTube track data (path is relative to the
# working directory) and list its columns.
spotify_youtube_path <- "datasets/Spotify_Youtube.csv"
songs <- read.csv(file = spotify_youtube_path)
colnames(songs)
##  [1] "X"                "Artist"           "Url_spotify"     
##  [4] "Track"            "Album"            "Album_type"      
##  [7] "Uri"              "Danceability"     "Energy"          
## [10] "Key"              "Loudness"         "Speechiness"     
## [13] "Acousticness"     "Instrumentalness" "Liveness"        
## [16] "Valence"          "Tempo"            "Duration_ms"     
## [19] "Url_youtube"      "Title"            "Channel"         
## [22] "Views"            "Likes"            "Comments"        
## [25] "Description"      "Licensed"         "official_video"  
## [28] "Stream"
# Show the storage class of every column to decide which ones need conversion.
column_classes <- sapply(songs, class)
print(column_classes)
##                X           Artist      Url_spotify            Track 
##        "integer"      "character"      "character"      "character" 
##            Album       Album_type              Uri     Danceability 
##      "character"      "character"      "character"        "numeric" 
##           Energy              Key         Loudness      Speechiness 
##        "numeric"        "numeric"        "numeric"        "numeric" 
##     Acousticness Instrumentalness         Liveness          Valence 
##        "numeric"        "numeric"        "numeric"        "numeric" 
##            Tempo      Duration_ms      Url_youtube            Title 
##        "numeric"        "numeric"      "character"      "character" 
##          Channel            Views            Likes         Comments 
##      "character"        "numeric"        "numeric"        "numeric" 
##      Description         Licensed   official_video           Stream 
##      "character"      "character"      "character"        "numeric"

Removing missing values


# Drop every row that contains at least one NA.
songs <- songs %>%
    na.omit()

# Sanity check: count the remaining NAs per column. across() replaces the
# superseded summarise_all() and yields the same one-row summary.
missing_values <- songs %>%
    summarise(across(everything(), ~sum(is.na(.x))))

# Prints only the counts that are still above zero (none are expected).
print(missing_values[missing_values > 0])
## integer(0)

Removing outliers


# Flag the values of a numeric vector that lie inside the Tukey fences
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR].
#
# Despite its name, this does not remove anything: it returns a logical
# vector (TRUE = keep, FALSE = outlier) of the same length as `variable`,
# meant to be used as a row filter by the caller.
remove_outliers <- function(variable) {
    quartiles <- quantile(variable, probs = c(0.25, 0.75))
    spread <- quartiles[2] - quartiles[1]
    fence_low <- quartiles[1] - 1.5 * spread
    fence_high <- quartiles[2] + 1.5 * spread
    variable >= fence_low & variable <= fence_high
}

# Logical flag per column: TRUE for numeric columns. vapply() guarantees a
# logical vector regardless of the input shape (sapply() does not).
numeric_vars <- vapply(songs, is.numeric, logical(1))

# Apply the IQR filter one numeric column at a time. Each pass recomputes the
# quartiles on the rows that survived the previous passes, so the result
# depends on column order.
# NOTE(review): the filtered frame is not referenced again in the code visible
# here (songs_clean is built from `songs`) -- confirm whether that is intended.
your_data_frame_no_outliers <- songs
for (var in names(songs)[numeric_vars]) {
    keep_rows <- remove_outliers(your_data_frame_no_outliers[[var]])
    # drop = FALSE keeps a data frame even if only one column exists.
    your_data_frame_no_outliers <- your_data_frame_no_outliers[keep_rows, , drop = FALSE]
}

Cleaning the dataset (variables)



# Build the modelling table:
#   * drop URL/description/index columns that carry no modelling signal,
#   * turn the categorical text columns into factors,
#   * popular: 1 when the track has more than 100 million streams, else 0,
#   * channel_factor: the 10 most frequent channels, everything else "Other",
#   * log_likes / log_dance: natural logs of Likes and Danceability
#     (log(0) gives -Inf; those rows are filtered out further down).
songs_clean <- songs %>%
    select(-Url_spotify, -Uri, -Url_youtube, -Description, -X) %>%
    mutate(
        across(c(Album_type, Licensed, official_video, Channel, Artist), as.factor),
        popular = as.factor(if_else(Stream > 100000000, 1, 0)),
        channel_factor = fct_lump_n(Channel, n = 10),
        log_likes = log(Likes),
        log_dance = log(Danceability)
    )

# List the channel levels kept by fct_lump_n() plus the lumped "Other" level.
levels(songs_clean$channel_factor)
##  [1] "Atlantic Records"   "DisneyMusicVEVO"    "RHINO"             
##  [4] "SMTOWN"             "Sony Music India"   "SonyMusicIndiaVEVO"
##  [7] "SonyMusicSouthVEVO" "T-Series"           "YRF"               
## [10] "Zee Music Company"  "Other"

# Round the audio-feature scores to three decimals.
audio_features <- c("Speechiness", "Instrumentalness", "Acousticness", "Liveness",
    "Danceability", "Energy", "Valence")
songs_clean[audio_features] <- lapply(songs_clean[audio_features], round, digits = 3)


# Quick structural check of the cleaned table: dimensions, column types, first values.
glimpse(songs_clean)
## Rows: 19,549
## Columns: 27
## $ Artist           <fct> "Gorillaz", "Gorillaz", "Gorillaz", "Gorill…
## $ Track            <chr> "Feel Good Inc.", "Rhinestone Eyes", "New G…
## $ Album            <chr> "Demon Days", "Plastic Beach", "New Gold (f…
## $ Album_type       <fct> album, album, single, album, album, album, …
## $ Danceability     <dbl> 0.818, 0.676, 0.695, 0.689, 0.663, 0.760, 0…
## $ Energy           <dbl> 0.705, 0.703, 0.923, 0.739, 0.694, 0.891, 0…
## $ Key              <dbl> 6, 8, 1, 2, 10, 11, 4, 11, 2, 10, 9, 4, 9, …
## $ Loudness         <dbl> -6.679, -5.815, -3.930, -5.810, -8.627, -5.…
## $ Speechiness      <dbl> 0.177, 0.030, 0.052, 0.026, 0.171, 0.037, 0…
## $ Acousticness     <dbl> 0.008, 0.087, 0.043, 0.000, 0.025, 0.023, 0…
## $ Instrumentalness <dbl> 0.002, 0.001, 0.047, 0.509, 0.000, 0.087, 0…
## $ Liveness         <dbl> 0.613, 0.046, 0.116, 0.064, 0.070, 0.298, 0…
## $ Valence          <dbl> 0.772, 0.852, 0.551, 0.578, 0.525, 0.966, 0…
## $ Tempo            <dbl> 138.559, 92.761, 108.014, 120.423, 167.953,…
## $ Duration_ms      <dbl> 222640, 200173, 215150, 233867, 340920, 245…
## $ Title            <chr> "Gorillaz - Feel Good Inc. (Official Video)…
## $ Channel          <fct> "Gorillaz", "Gorillaz", "Gorillaz", "Gorill…
## $ Views            <dbl> 693555221, 72011645, 8435055, 211754952, 61…
## $ Likes            <dbl> 6220896, 1079128, 282142, 1788577, 6197318,…
## $ Comments         <dbl> 169907, 31003, 7399, 55229, 155930, 72008, …
## $ Licensed         <fct> True, True, True, True, True, True, False, …
## $ official_video   <fct> True, True, True, True, True, True, True, F…
## $ Stream           <dbl> 1040234854, 310083733, 63063467, 434663559,…
## $ popular          <fct> 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1…
## $ channel_factor   <fct> Other, Other, Other, Other, Other, Other, O…
## $ log_likes        <dbl> 15.643425, 13.891664, 12.550166, 14.396931,…
## $ log_dance        <dbl> -0.2008929, -0.3915622, -0.3638434, -0.3725…

Removing outliers in Likes


# Tukey fences for Likes: keep rows within [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR].
Q1 <- quantile(songs_clean$Likes, 0.25)
Q3 <- quantile(songs_clean$Likes, 0.75)
# Named likes_iqr rather than IQR so the stats::IQR function is not shadowed.
likes_iqr <- Q3 - Q1

lower_bound <- Q1 - 1.5 * likes_iqr
upper_bound <- Q3 + 1.5 * likes_iqr

likes_in_range <- songs_clean$Likes >= lower_bound & songs_clean$Likes <= upper_bound
songs_clean <- songs_clean[likes_in_range, ]



# Keep only rows whose log-transformed columns are usable numbers.
# is.finite() is FALSE for NA, NaN, Inf and -Inf, so one combined mask removes
# both missing values and the -Inf produced by log(0).
has_finite_logs <- is.finite(songs_clean$log_likes) & is.finite(songs_clean$log_dance)
songs_clean <- songs_clean[has_finite_logs, ]

Splitting the dataset into testing and training datasets


# Fix the RNG seed so the random 75/25 split -- and every model estimate and
# error metric derived from it -- is the same on each run.
set.seed(310)
songs_split <- initial_split(songs_clean, prop = 0.75)
songs_train <- training(songs_split)
songs_test <- testing(songs_split)

Final Regression Model

Version 5


# Linear model for log(Likes) fitted on the training split. Predictors: three
# audio features (Danceability, Valence, Liveness), Loudness, the two video
# flags (Licensed, official_video) and the lumped channel factor.
lin_mod5 <- lm(log_likes ~ Danceability + Licensed + official_video + Valence +
    channel_factor + Loudness + Liveness, data = songs_train)

# Check the summary of the model
summary(lin_mod5)
## 
## Call:
## lm(formula = log_likes ~ Danceability + Licensed + official_video + 
##     Valence + channel_factor + Loudness + Liveness, data = songs_train)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -9.775 -1.074  0.301  1.411  7.214 
## 
## Coefficients:
##                                   Estimate Std. Error t value
## (Intercept)                       9.568407   0.480019  19.933
## Danceability                      0.933487   0.125882   7.416
## LicensedTrue                      0.330246   0.065996   5.004
## official_videoTrue                1.490195   0.072101  20.668
## Valence                          -0.373176   0.082741  -4.510
## channel_factorDisneyMusicVEVO     1.725819   0.611499   2.822
## channel_factorRHINO               0.369808   0.568757   0.650
## channel_factorSMTOWN              2.144931   0.725793   2.955
## channel_factorSony Music India    1.614380   0.627562   2.572
## channel_factorSonyMusicIndiaVEVO  1.956641   0.539356   3.628
## channel_factorSonyMusicSouthVEVO  1.278830   0.538172   2.376
## channel_factorT-Series            1.874958   0.510177   3.675
## channel_factorYRF                 2.497900   0.616816   4.050
## channel_factorZee Music Company   1.784397   0.584305   3.054
## channel_factorOther               0.549426   0.470567   1.168
## Loudness                          0.112633   0.004175  26.976
## Liveness                         -0.399363   0.106853  -3.738
##                                              Pr(>|t|)    
## (Intercept)                      < 0.0000000000000002 ***
## Danceability                        0.000000000000129 ***
## LicensedTrue                        0.000000568804402 ***
## official_videoTrue               < 0.0000000000000002 ***
## Valence                             0.000006535726002 ***
## channel_factorDisneyMusicVEVO                0.004776 ** 
## channel_factorRHINO                          0.515572    
## channel_factorSMTOWN                         0.003129 ** 
## channel_factorSony Music India               0.010109 *  
## channel_factorSonyMusicIndiaVEVO             0.000287 ***
## channel_factorSonyMusicSouthVEVO             0.017504 *  
## channel_factorT-Series                       0.000239 ***
## channel_factorYRF                   0.000051594129649 ***
## channel_factorZee Music Company              0.002264 ** 
## channel_factorOther                          0.242997    
## Loudness                         < 0.0000000000000002 ***
## Liveness                                     0.000187 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.994 on 12766 degrees of freedom
## Multiple R-squared:  0.2166, Adjusted R-squared:  0.2156 
## F-statistic: 220.6 on 16 and 12766 DF,  p-value: < 0.00000000000000022

# Render the fitted coefficients as a formatted regression table
# (estimates, confidence intervals and p-values)
tab_model(lin_mod5)
  log_likes
Predictors Estimates CI p
(Intercept) 9.57 8.63 – 10.51 <0.001
Danceability 0.93 0.69 – 1.18 <0.001
Licensed [True] 0.33 0.20 – 0.46 <0.001
official video [True] 1.49 1.35 – 1.63 <0.001
Valence -0.37 -0.54 – -0.21 <0.001
channel factor
[DisneyMusicVEVO]
1.73 0.53 – 2.92 0.005
channel factor [RHINO] 0.37 -0.75 – 1.48 0.516
channel factor [SMTOWN] 2.14 0.72 – 3.57 0.003
channel factor [Sony
Music India]
1.61 0.38 – 2.84 0.010
channel factor
[SonyMusicIndiaVEVO]
1.96 0.90 – 3.01 <0.001
channel factor
[SonyMusicSouthVEVO]
1.28 0.22 – 2.33 0.018
channel factor [T-Series] 1.87 0.87 – 2.87 <0.001
channel factor [YRF] 2.50 1.29 – 3.71 <0.001
channel factor [Zee Music
Company]
1.78 0.64 – 2.93 0.002
channel factor [Other] 0.55 -0.37 – 1.47 0.243
Loudness 0.11 0.10 – 0.12 <0.001
Liveness -0.40 -0.61 – -0.19 <0.001
Observations 12783
R2 / R2 adjusted 0.217 / 0.216

# Plot the fitted model (presumably sjPlot's default coefficient plot -- confirm)
plot_model(lin_mod5)


# Coefficient table as a data frame (tidy() presumably comes from broom via
# tidymodels -- confirm)
tidy(lin_mod5)

Generating Predictions for the model

Generating the predictions


# The model predicts log(Likes); score both splits first ...
log_preds_train <- predict(lin_mod5, newdata = songs_train)
log_preds_test <- predict(lin_mod5, newdata = songs_test)

# ... then exponentiate to get predictions back on the Likes scale.
preds_train <- exp(log_preds_train)
preds_test <- exp(log_preds_test)

Calculating RMSE in the test and training sets


# Root mean squared error between observed values and predictions.
#
# true:        numeric vector of observed values.
# predictions: numeric vector of predicted values.
# Returns a single number, in the same units as the inputs.
get_rmse <- function(true, predictions) {
    squared_errors <- (true - predictions)^2
    sqrt(mean(squared_errors))
}
# RMSE on the raw Likes scale (observed Likes vs. exponentiated predictions),
# training split first, then the test split.
get_rmse(songs_train$Likes, preds_train)
## [1] 304032.7
get_rmse(songs_test$Likes, preds_test)
## [1] 309089.6

Generating prediction/true plots


# One row per track: predicted Likes, observed Likes, and which split it is from.
results_train <- tibble(
    preds = preds_train,
    true = songs_train$Likes,
    type = "train"
)
results_test <- tibble(
    preds = preds_test,
    true = songs_test$Likes,
    type = "test"
)
results_df <- bind_rows(results_train, results_test)

# Predicted vs. observed Likes, one panel per split. The purple line is the
# default geom_abline() (intercept 0, slope 1), i.e. a perfect prediction.
ggplot(results_df, aes(x = true, y = preds)) +
    geom_point(aes(color = type), alpha = 1/10) +
    geom_abline(color = "purple") +
    facet_wrap(~type) +
    xlim(0, 1500000) +
    ylim(0, 500000) +
    theme_clean(base_size = 8) +
    theme(legend.position = "bottom")

Calculating MAE


# Mean absolute error between observed values and predictions.
#
# true:        numeric vector of observed values.
# predictions: numeric vector of predicted values.
# Returns a single number, in the same units as the inputs.
get_mae <- function(true, predictions) {
    absolute_errors <- abs(true - predictions)
    mean(absolute_errors)
}

# MAE per split. Each metric is computed from its own split (the two were
# previously crossed, so the train value was reported as test and vice versa).
MAE_train <- get_mae(results_train$true, results_train$preds)
MAE_test <- get_mae(results_test$true, results_test$preds)

print(MAE_train)
## [1] 177188
print(MAE_test)
## [1] 180000.2

# Name the columns with `=`; using `<-` inside data.frame() creates stray
# global variables and auto-generated column names instead.
MAE_data <- data.frame(MAE_plot = c("MAE train", "MAE test"), MAE_plot1 = c(MAE_train,
    MAE_test))

# Bar chart comparing the two MAE values, with each value printed mid-bar.
plot1 <- ggplot(MAE_data, aes(x = MAE_plot, y = MAE_plot1, fill = MAE_plot)) +
    geom_bar(stat = "identity") +
    geom_text(
        aes(label = MAE_plot1),
        position = position_stack(vjust = 0.5),
        size = 3
    ) +
    labs(title = "Mean Absolute Error", x = "MAE_plot", y = "MAE_plot1") +
    theme_clean()

print(plot1)

Observations in the variables

Checking correlation


# Pairwise Pearson correlations between all numeric columns, shown to two decimals.
is_numeric_column <- sapply(songs_clean, is.numeric)
numeric_data <- songs_clean[, is_numeric_column]

correlation_matrix <- cor(numeric_data)

rounded_correlations <- round(correlation_matrix, 2)
print(rounded_correlations)
##                  Danceability Energy   Key Loudness Speechiness
## Danceability             1.00   0.24  0.04     0.36        0.23
## Energy                   0.24   1.00  0.03     0.75        0.09
## Key                      0.04   0.03  1.00     0.03        0.02
## Loudness                 0.36   0.75  0.03     1.00        0.06
## Speechiness              0.23   0.09  0.02     0.06        1.00
## Acousticness            -0.28  -0.67 -0.04    -0.56       -0.10
## Instrumentalness        -0.32  -0.32  0.00    -0.55       -0.11
## Liveness                -0.08   0.18 -0.01     0.09        0.07
## Valence                  0.47   0.39  0.04     0.32        0.06
## Tempo                   -0.07   0.16  0.00     0.14        0.04
## Duration_ms             -0.09   0.03  0.00     0.01       -0.05
## Views                    0.06   0.10  0.01     0.15       -0.05
## Likes                    0.11   0.10  0.02     0.18        0.02
## Comments                 0.05   0.12  0.01     0.14        0.01
## Stream                   0.03   0.03 -0.01     0.09       -0.04
## log_likes                0.14   0.18  0.03     0.29       -0.04
## log_dance                0.96   0.29  0.04     0.43        0.20
##                  Acousticness Instrumentalness Liveness Valence Tempo
## Danceability            -0.28            -0.32    -0.08    0.47 -0.07
## Energy                  -0.67            -0.32     0.18    0.39  0.16
## Key                     -0.04             0.00    -0.01    0.04  0.00
## Loudness                -0.56            -0.55     0.09    0.32  0.14
## Speechiness             -0.10            -0.11     0.07    0.06  0.04
## Acousticness             1.00             0.29    -0.05   -0.21 -0.13
## Instrumentalness         0.29             1.00    -0.06   -0.28 -0.08
## Liveness                -0.05            -0.06     1.00    0.03  0.01
## Valence                 -0.21            -0.28     0.03    1.00  0.09
## Tempo                   -0.13            -0.08     0.01    0.09  1.00
## Duration_ms             -0.03             0.00     0.00   -0.05 -0.02
## Views                   -0.08            -0.12     0.02    0.07  0.04
## Likes                   -0.10            -0.12    -0.01    0.03  0.04
## Comments                -0.13            -0.09     0.00    0.01  0.03
## Stream                  -0.08            -0.09    -0.04   -0.01  0.01
## log_likes               -0.17            -0.21    -0.01    0.06  0.05
## log_dance               -0.31            -0.40    -0.06    0.48 -0.04
##                  Duration_ms Views Likes Comments Stream log_likes
## Danceability           -0.09  0.06  0.11     0.05   0.03      0.14
## Energy                  0.03  0.10  0.10     0.12   0.03      0.18
## Key                     0.00  0.01  0.02     0.01  -0.01      0.03
## Loudness                0.01  0.15  0.18     0.14   0.09      0.29
## Speechiness            -0.05 -0.05  0.02     0.01  -0.04     -0.04
## Acousticness           -0.03 -0.08 -0.10    -0.13  -0.08     -0.17
## Instrumentalness        0.00 -0.12 -0.12    -0.09  -0.09     -0.21
## Liveness                0.00  0.02 -0.01     0.00  -0.04     -0.01
## Valence                -0.05  0.07  0.03     0.01  -0.01      0.06
## Tempo                  -0.02  0.04  0.04     0.03   0.01      0.05
## Duration_ms             1.00  0.05  0.03     0.04  -0.01      0.05
## Views                   0.05  1.00  0.82     0.60   0.33      0.57
## Likes                   0.03  0.82  1.00     0.77   0.42      0.70
## Comments                0.04  0.60  0.77     1.00   0.28      0.55
## Stream                 -0.01  0.33  0.42     0.28   1.00      0.31
## log_likes               0.05  0.57  0.70     0.55   0.31      1.00
## log_dance              -0.09  0.07  0.11     0.05   0.04      0.16
##                  log_dance
## Danceability          0.96
## Energy                0.29
## Key                   0.04
## Loudness              0.43
## Speechiness           0.20
## Acousticness         -0.31
## Instrumentalness     -0.40
## Liveness             -0.06
## Valence               0.48
## Tempo                -0.04
## Duration_ms          -0.09
## Views                 0.07
## Likes                 0.11
## Comments              0.05
## Stream                0.04
## log_likes             0.16
## log_dance             1.00

# Report every pair of distinct variables whose absolute correlation exceeds
# the threshold. Because the matrix is symmetric, each pair is listed twice
# (once per ordering).
threshold <- 0.3

# Exclude the diagonal by position rather than by comparing floats with 1.
is_off_diagonal <- row(correlation_matrix) != col(correlation_matrix)
strong_correlation_indices <- which(abs(correlation_matrix) > threshold &
    is_off_diagonal, arr.ind = TRUE)

# seq_len() makes the loop a no-op when nothing exceeds the threshold;
# 1:nrow(...) would iterate over c(1, 0) and fail on the first lookup.
for (i in seq_len(nrow(strong_correlation_indices))) {
    row_index <- strong_correlation_indices[i, 1]
    col_index <- strong_correlation_indices[i, 2]
    correlation <- correlation_matrix[row_index, col_index]
    print(paste("Variables:", colnames(correlation_matrix)[row_index],
        "and", colnames(correlation_matrix)[col_index], "Correlation:",
        round(correlation, 2)))
}
## [1] "Variables: Loudness and Danceability Correlation: 0.36"
## [1] "Variables: Instrumentalness and Danceability Correlation: -0.32"
## [1] "Variables: Valence and Danceability Correlation: 0.47"
## [1] "Variables: log_dance and Danceability Correlation: 0.96"
## [1] "Variables: Loudness and Energy Correlation: 0.75"
## [1] "Variables: Acousticness and Energy Correlation: -0.67"
## [1] "Variables: Instrumentalness and Energy Correlation: -0.32"
## [1] "Variables: Valence and Energy Correlation: 0.39"
## [1] "Variables: Danceability and Loudness Correlation: 0.36"
## [1] "Variables: Energy and Loudness Correlation: 0.75"
## [1] "Variables: Acousticness and Loudness Correlation: -0.56"
## [1] "Variables: Instrumentalness and Loudness Correlation: -0.55"
## [1] "Variables: Valence and Loudness Correlation: 0.32"
## [1] "Variables: log_dance and Loudness Correlation: 0.43"
## [1] "Variables: Energy and Acousticness Correlation: -0.67"
## [1] "Variables: Loudness and Acousticness Correlation: -0.56"
## [1] "Variables: log_dance and Acousticness Correlation: -0.31"
## [1] "Variables: Danceability and Instrumentalness Correlation: -0.32"
## [1] "Variables: Energy and Instrumentalness Correlation: -0.32"
## [1] "Variables: Loudness and Instrumentalness Correlation: -0.55"
## [1] "Variables: log_dance and Instrumentalness Correlation: -0.4"
## [1] "Variables: Danceability and Valence Correlation: 0.47"
## [1] "Variables: Energy and Valence Correlation: 0.39"
## [1] "Variables: Loudness and Valence Correlation: 0.32"
## [1] "Variables: log_dance and Valence Correlation: 0.48"
## [1] "Variables: Likes and Views Correlation: 0.82"
## [1] "Variables: Comments and Views Correlation: 0.6"
## [1] "Variables: Stream and Views Correlation: 0.33"
## [1] "Variables: log_likes and Views Correlation: 0.57"
## [1] "Variables: Views and Likes Correlation: 0.82"
## [1] "Variables: Comments and Likes Correlation: 0.77"
## [1] "Variables: Stream and Likes Correlation: 0.42"
## [1] "Variables: log_likes and Likes Correlation: 0.7"
## [1] "Variables: Views and Comments Correlation: 0.6"
## [1] "Variables: Likes and Comments Correlation: 0.77"
## [1] "Variables: log_likes and Comments Correlation: 0.55"
## [1] "Variables: Views and Stream Correlation: 0.33"
## [1] "Variables: Likes and Stream Correlation: 0.42"
## [1] "Variables: log_likes and Stream Correlation: 0.31"
## [1] "Variables: Views and log_likes Correlation: 0.57"
## [1] "Variables: Likes and log_likes Correlation: 0.7"
## [1] "Variables: Comments and log_likes Correlation: 0.55"
## [1] "Variables: Stream and log_likes Correlation: 0.31"
## [1] "Variables: Danceability and log_dance Correlation: 0.96"
## [1] "Variables: Loudness and log_dance Correlation: 0.43"
## [1] "Variables: Acousticness and log_dance Correlation: -0.31"
## [1] "Variables: Instrumentalness and log_dance Correlation: -0.4"
## [1] "Variables: Valence and log_dance Correlation: 0.48"

# Names of the numeric columns in the cleaned table.
numeric_columns <- names(Filter(is.numeric, songs_clean))

# Correlations between those columns.
correlation_matrix <- cor(songs_clean[numeric_columns])

# Heatmap with the coefficient printed in each cell; variables are ordered by
# hierarchical clustering.
corr_heatmap <- corrplot(
    correlation_matrix,
    method = "color",
    addCoef.col = "black",
    order = "hclust",
    tl.col = "black",
    tl.srt = 45,
    tl.cex = 0.7,
    number.cex = 0.5
)

Checking histograms


# Distribution of the response (Likes) and of the numeric predictors used in
# the regression model.
hist(songs_clean$Likes)

hist(songs_clean$Danceability)

hist(songs_clean$Valence)

hist(songs_clean$Loudness)

hist(songs_clean$Liveness)

Summary stats on our dependent variable Likes


# Five-number summary plus mean of Likes after outlier removal.
summary(songs_clean$Likes)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       1   16385   88364  217023  300520 1282925

# Summary statistics of Likes, computed once (the assignment and its comment
# were previously duplicated).
likes_summary <- summary(songs_clean$Likes)

# Bar plot for mean and median
likes_center <- c(likes_summary["Mean"], likes_summary["Median"])
barplot(likes_center, col = c("lightblue", "lightgreen"), main = "Mean and Median of Likes",
    ylab = "Count", names.arg = c("Mean", "Median"), ylim = c(0, max(likes_center) +
        500000))

Observations for the variable Artist - find the top 15 artists by total YouTube views and by total Spotify streams

# Most viewed artists on YouTube: total views per artist, top 15.
artists_yt <- songs_clean %>%
    group_by(Artist) %>%
    summarize(TotalViews = sum(Views)) %>%
    arrange(desc(TotalViews)) %>%
    head(15)


# Horizontal bar chart. The axis label must be passed as `x`; labs() silently
# ignores an unknown name such as `xlabel`, leaving the raw column name.
ggplot(artists_yt, aes(x = TotalViews, y = reorder(Artist, -TotalViews))) +
    geom_bar(stat = "identity", fill = "red") +
    labs(title = "Top 15 most viewed artists on Youtube", x = "Number of views",
        y = "Artists") +
    theme_clean()




# Most streamed artists on Spotify: total streams per artist, top 15.
# (Reuses the name artists_yt, overwriting the YouTube table.)
artists_yt <- songs_clean %>%
    group_by(Artist) %>%
    summarize(TotalStreams = sum(Stream)) %>%
    arrange(desc(TotalStreams)) %>%
    head(15)

# Create a horizontal bar plot. The axis label must be passed as `x`; labs()
# silently ignores an unknown name such as `xlabel`.
ggplot(artists_yt, aes(x = TotalStreams, y = reorder(Artist, -TotalStreams))) +
    geom_bar(stat = "identity", fill = "green") +
    labs(title = "Top 15 most streamed artists on Spotify", x = "Number of streams",
        y = "Artists") +
    theme_clean()

Observations for variable Track


# The 8 most-liked tracks with their audio features. slice_max() already
# returns rows ordered by Likes (descending), so no separate arrange() is
# needed; ties at the cut-off may return more than 8 rows.
toppsongs_likes <- songs_clean %>%
    slice_max(Likes, n = 8) %>%
    select(Artist, Track, Likes, Danceability, Speechiness, Acousticness,
        Instrumentalness, Liveness, Valence, Tempo, Energy)

# Create a bar plot (single colour for every artist). The title states 8 to
# match the number of tracks actually selected above.
ggplot(toppsongs_likes, aes(x = Track, y = Likes, fill = Artist)) +
    geom_bar(stat = "identity", position = "dodge") +
    labs(title = "Top 8 Liked Songs", x = "Track", y = "Number of Likes") +
    scale_fill_manual(values = rep("#A52A2A", nrow(toppsongs_likes))) +
    theme_clean() +
    theme(axis.text.x = element_text(size = 8, angle = 45, hjust = 1))


# One distinct colour per artist appearing among the top tracks.
unique_artists <- unique(toppsongs_likes$Artist)
artist_colors <- rainbow(length(unique_artists))

# Create a named vector with artist-color mapping
color_mapping <- setNames(artist_colors, unique_artists)

# Create a bar plot coloured by artist. The title states 8 to match the
# slice_max(Likes, n = 8) selection that produced toppsongs_likes.
ggplot(toppsongs_likes, aes(x = Track, y = Likes, fill = Artist)) +
    geom_bar(stat = "identity", position = "dodge") +
    labs(title = "Top 8 Liked Songs", x = "Track", y = "Number of Likes") +
    scale_fill_manual(values = color_mapping) +
    theme_clean() +
    theme(axis.text.x = element_text(size = 8, angle = 45, hjust = 1))

Relationship between top songs and other variables


# Bar chart of one audio feature across the top tracks.
#
# data:    data frame with a Track column and the feature column.
# feature: name of the feature column, as a string; also used in the title
#          and as the y-axis label.
# Returns a ggplot object.
plot_top_track_feature <- function(data, feature) {
    ggplot(data, aes(x = Track, y = .data[[feature]], fill = Track)) +
        geom_bar(stat = "identity") +
        labs(title = paste(feature, "of YouTube top tracks"), x = "", y = feature) +
        theme_minimal() +
        theme(axis.text.x = element_text(angle = 20, hjust = 1)) +
        # "none" replaces the deprecated guides(fill = FALSE); the legend would
        # only repeat the x-axis labels.
        guides(fill = "none")
}

p1 <- plot_top_track_feature(toppsongs_likes, "Danceability")

p2 <- plot_top_track_feature(toppsongs_likes, "Energy")

p3 <- plot_top_track_feature(toppsongs_likes, "Speechiness")

p4 <- plot_top_track_feature(toppsongs_likes, "Valence")

p5 <- plot_top_track_feature(toppsongs_likes, "Acousticness")

p6 <- plot_top_track_feature(toppsongs_likes, "Liveness")


# Render the six feature plots in order.
for (feature_plot in list(p1, p2, p3, p4, p5, p6)) {
    print(feature_plot)
}

Observations for the variable valence


# Likes against Valence with a straight-line (lm) fit and no confidence band.
ggplot(songs_clean, aes(x = Valence, y = Likes)) +
    geom_point(alpha = 1/10) +
    geom_smooth(method = "lm", se = FALSE, color = "blue") +
    labs(title = "Scatter Plot of Likes vs Valence", x = "Valence", y = "Likes") +
    ylim(0, 10000000) +
    theme_clean()

Finding the top channels on Youtube


library(ggplot2)

# Exclude the lumped "Other" level so only the individually named channels are
# counted. The previous filter value was an unreplaced placeholder that matched
# no level, so nothing was excluded.
# NOTE(review): "Other" is assumed to be the level meant here (its bar is far
# above the 0-250 y-range used below) -- confirm.
filtered_data <- subset(songs_clean, channel_factor != "Other")

# Create a bar plot. theme_minimal() is added before theme() so that the
# axis-text tweak is not overwritten by the complete theme.
ggplot(filtered_data, aes(x = channel_factor)) +
    geom_bar(fill = "#A52A2A") +
    labs(title = "Count of Tracks for Each Channel Factor ", x = "Channel Factor",
        y = "Count") +
    ylim(0, 250) +
    theme_minimal() +
    theme(axis.text.x = element_text(size = 8, angle = 45, hjust = 1))

theme_minimal()
## List of 97
##  $ line                      :List of 6
##   ..$ colour       : chr "black"
##   ..$ linewidth    : num 0.5
##   ..$ linetype     : num 1
##   ..$ lineend      : chr "butt"
##   ..$ arrow        : logi FALSE
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_line" "element"
##  $ rect                      :List of 5
##   ..$ fill         : chr "white"
##   ..$ colour       : chr "black"
##   ..$ linewidth    : num 0.5
##   ..$ linetype     : num 1
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_rect" "element"
##  $ text                      :List of 11
##   ..$ family       : chr ""
##   ..$ face         : chr "plain"
##   ..$ colour       : chr "black"
##   ..$ size         : num 11
##   ..$ hjust        : num 0.5
##   ..$ vjust        : num 0.5
##   ..$ angle        : num 0
##   ..$ lineheight   : num 0.9
##   ..$ margin       : 'margin' num [1:4] 0points 0points 0points 0points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : logi FALSE
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ title                     : NULL
##  $ aspect.ratio              : NULL
##  $ axis.title                : NULL
##  $ axis.title.x              :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : NULL
##   ..$ vjust        : num 1
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 2.75points 0points 0points 0points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.title.x.top          :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : NULL
##   ..$ vjust        : num 0
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 0points 0points 2.75points 0points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.title.x.bottom       : NULL
##  $ axis.title.y              :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : NULL
##   ..$ vjust        : num 1
##   ..$ angle        : num 90
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 0points 2.75points 0points 0points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.title.y.left         : NULL
##  $ axis.title.y.right        :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : NULL
##   ..$ vjust        : num 0
##   ..$ angle        : num -90
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 0points 0points 0points 2.75points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.text                 :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : chr "grey30"
##   ..$ size         : 'rel' num 0.8
##   ..$ hjust        : NULL
##   ..$ vjust        : NULL
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.text.x               :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : NULL
##   ..$ vjust        : num 1
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 2.2points 0points 0points 0points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.text.x.top           :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : NULL
##   ..$ vjust        : num 0
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 0points 0points 2.2points 0points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.text.x.bottom        : NULL
##  $ axis.text.y               :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : num 1
##   ..$ vjust        : NULL
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 0points 2.2points 0points 0points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.text.y.left          : NULL
##  $ axis.text.y.right         :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : num 0
##   ..$ vjust        : NULL
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 0points 0points 0points 2.2points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.ticks                : list()
##   ..- attr(*, "class")= chr [1:2] "element_blank" "element"
##  $ axis.ticks.x              : NULL
##  $ axis.ticks.x.top          : NULL
##  $ axis.ticks.x.bottom       : NULL
##  $ axis.ticks.y              : NULL
##  $ axis.ticks.y.left         : NULL
##  $ axis.ticks.y.right        : NULL
##  $ axis.ticks.length         : 'simpleUnit' num 2.75points
##   ..- attr(*, "unit")= int 8
##  $ axis.ticks.length.x       : NULL
##  $ axis.ticks.length.x.top   : NULL
##  $ axis.ticks.length.x.bottom: NULL
##  $ axis.ticks.length.y       : NULL
##  $ axis.ticks.length.y.left  : NULL
##  $ axis.ticks.length.y.right : NULL
##  $ axis.line                 : list()
##   ..- attr(*, "class")= chr [1:2] "element_blank" "element"
##  $ axis.line.x               : NULL
##  $ axis.line.x.top           : NULL
##  $ axis.line.x.bottom        : NULL
##  $ axis.line.y               : NULL
##  $ axis.line.y.left          : NULL
##  $ axis.line.y.right         : NULL
##  $ legend.background         : list()
##   ..- attr(*, "class")= chr [1:2] "element_blank" "element"
##  $ legend.margin             : 'margin' num [1:4] 5.5points 5.5points 5.5points 5.5points
##   ..- attr(*, "unit")= int 8
##  $ legend.spacing            : 'simpleUnit' num 11points
##   ..- attr(*, "unit")= int 8
##  $ legend.spacing.x          : NULL
##  $ legend.spacing.y          : NULL
##  $ legend.key                : list()
##   ..- attr(*, "class")= chr [1:2] "element_blank" "element"
##  $ legend.key.size           : 'simpleUnit' num 1.2lines
##   ..- attr(*, "unit")= int 3
##  $ legend.key.height         : NULL
##  $ legend.key.width          : NULL
##  $ legend.text               :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : 'rel' num 0.8
##   ..$ hjust        : NULL
##   ..$ vjust        : NULL
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ legend.text.align         : NULL
##  $ legend.title              :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : num 0
##   ..$ vjust        : NULL
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ legend.title.align        : NULL
##  $ legend.position           : chr "right"
##  $ legend.direction          : NULL
##  $ legend.justification      : chr "center"
##  $ legend.box                : NULL
##  $ legend.box.just           : NULL
##  $ legend.box.margin         : 'margin' num [1:4] 0cm 0cm 0cm 0cm
##   ..- attr(*, "unit")= int 1
##  $ legend.box.background     : list()
##   ..- attr(*, "class")= chr [1:2] "element_blank" "element"
##  $ legend.box.spacing        : 'simpleUnit' num 11points
##   ..- attr(*, "unit")= int 8
##  $ panel.background          : list()
##   ..- attr(*, "class")= chr [1:2] "element_blank" "element"
##  $ panel.border              : list()
##   ..- attr(*, "class")= chr [1:2] "element_blank" "element"
##  $ panel.spacing             : 'simpleUnit' num 5.5points
##   ..- attr(*, "unit")= int 8
##  $ panel.spacing.x           : NULL
##  $ panel.spacing.y           : NULL
##  $ panel.grid                :List of 6
##   ..$ colour       : chr "grey92"
##   ..$ linewidth    : NULL
##   ..$ linetype     : NULL
##   ..$ lineend      : NULL
##   ..$ arrow        : logi FALSE
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_line" "element"
##  $ panel.grid.major          : NULL
##  $ panel.grid.minor          :List of 6
##   ..$ colour       : NULL
##   ..$ linewidth    : 'rel' num 0.5
##   ..$ linetype     : NULL
##   ..$ lineend      : NULL
##   ..$ arrow        : logi FALSE
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_line" "element"
##  $ panel.grid.major.x        : NULL
##  $ panel.grid.major.y        : NULL
##  $ panel.grid.minor.x        : NULL
##  $ panel.grid.minor.y        : NULL
##  $ panel.ontop               : logi FALSE
##  $ plot.background           : list()
##   ..- attr(*, "class")= chr [1:2] "element_blank" "element"
##  $ plot.title                :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : 'rel' num 1.2
##   ..$ hjust        : num 0
##   ..$ vjust        : num 1
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 0points 0points 5.5points 0points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ plot.title.position       : chr "panel"
##  $ plot.subtitle             :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : num 0
##   ..$ vjust        : num 1
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 0points 0points 5.5points 0points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ plot.caption              :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : 'rel' num 0.8
##   ..$ hjust        : num 1
##   ..$ vjust        : num 1
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 5.5points 0points 0points 0points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ plot.caption.position     : chr "panel"
##  $ plot.tag                  :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : 'rel' num 1.2
##   ..$ hjust        : num 0.5
##   ..$ vjust        : num 0.5
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ plot.tag.position         : chr "topleft"
##  $ plot.margin               : 'margin' num [1:4] 5.5points 5.5points 5.5points 5.5points
##   ..- attr(*, "unit")= int 8
##  $ strip.background          : list()
##   ..- attr(*, "class")= chr [1:2] "element_blank" "element"
##  $ strip.background.x        : NULL
##  $ strip.background.y        : NULL
##  $ strip.clip                : chr "inherit"
##  $ strip.placement           : chr "inside"
##  $ strip.text                :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : chr "grey10"
##   ..$ size         : 'rel' num 0.8
##   ..$ hjust        : NULL
##   ..$ vjust        : NULL
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 4.4points 4.4points 4.4points 4.4points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ strip.text.x              : NULL
##  $ strip.text.x.bottom       : NULL
##  $ strip.text.x.top          : NULL
##  $ strip.text.y              :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : NULL
##   ..$ vjust        : NULL
##   ..$ angle        : num -90
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ strip.text.y.left         :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : NULL
##   ..$ vjust        : NULL
##   ..$ angle        : num 90
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ strip.text.y.right        : NULL
##  $ strip.switch.pad.grid     : 'simpleUnit' num 2.75points
##   ..- attr(*, "unit")= int 8
##  $ strip.switch.pad.wrap     : 'simpleUnit' num 2.75points
##   ..- attr(*, "unit")= int 8
##  - attr(*, "class")= chr [1:2] "theme" "gg"
##  - attr(*, "complete")= logi TRUE
##  - attr(*, "validate")= logi TRUE

Testing Different Linear Regression Models

Version 1 - low Adjusted R-squared: 0.09493 - Variables that stood out according to p-value: Danceability, LicensedTrue, Instrumentalness, channel_factorSMTOWN, channel_factorSonyMusicSouthVEVO - model needs to be modified


# Version 1: regress Likes on danceability, duration, channel, licensing,
# album type, and instrumentalness, using the rows in songs_train.
lin_mod1 <- lm(
    formula = Likes ~ Danceability + Duration_ms + channel_factor + Licensed +
        Album_type + Instrumentalness,
    data = songs_train
)

# Print the coefficient table, residual quantiles, and fit statistics.
summary(lin_mod1)
## 
## Call:
## lm(formula = Likes ~ Danceability + Duration_ms + channel_factor + 
##     Licensed + Album_type + Instrumentalness, data = songs_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -673522 -170463  -95007   71641 1181076 
## 
## Coefficients:
##                                       Estimate    Std. Error t value
## (Intercept)                       -67126.28162   65381.45034  -1.027
## Danceability                      140543.28742   15676.54422   8.965
## Duration_ms                            0.04799       0.01708   2.809
## channel_factorDisneyMusicVEVO     290960.37318   83975.55647   3.465
## channel_factorRHINO                -5009.33848   77770.54604  -0.064
## channel_factorSMTOWN              637360.63880   99318.31030   6.417
## channel_factorSony Music India    336793.67969   85896.01870   3.921
## channel_factorSonyMusicIndiaVEVO  303118.88719   73782.78287   4.108
## channel_factorSonyMusicSouthVEVO   95329.11116   73628.94574   1.295
## channel_factorT-Series            393964.17391   69850.14140   5.640
## channel_factorYRF                 510288.46290   84354.31456   6.049
## channel_factorZee Music Company   406853.27244   80016.55207   5.085
## channel_factorOther               100400.92147   64382.09563   1.559
## LicensedTrue                      132378.98162    5205.13043  25.432
## Album_typecompilation             -21412.71857   12897.58009  -1.660
## Album_typesingle                   -5722.84529    5840.48220  -0.980
## Instrumentalness                 -117439.80683   12845.04276  -9.143
##                                              Pr(>|t|)    
## (Intercept)                                  0.304587    
## Danceability                     < 0.0000000000000002 ***
## Duration_ms                                  0.004973 ** 
## channel_factorDisneyMusicVEVO                0.000532 ***
## channel_factorRHINO                          0.948643    
## channel_factorSMTOWN                   0.000000000144 ***
## channel_factorSony Music India         0.000088662986 ***
## channel_factorSonyMusicIndiaVEVO       0.000040113819 ***
## channel_factorSonyMusicSouthVEVO             0.195439    
## channel_factorT-Series                 0.000000017353 ***
## channel_factorYRF                      0.000000001495 ***
## channel_factorZee Music Company        0.000000373594 ***
## channel_factorOther                          0.118914    
## LicensedTrue                     < 0.0000000000000002 ***
## Album_typecompilation                        0.096896 .  
## Album_typesingle                             0.327175    
## Instrumentalness                 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 272900 on 12766 degrees of freedom
## Multiple R-squared:  0.09606,    Adjusted R-squared:  0.09493 
## F-statistic: 84.79 on 16 and 12766 DF,  p-value: < 0.00000000000000022

# sjPlot's default plot of the Version 1 model estimates.
plot_model(lin_mod1)


# Generating predictions for our model on both data sets. The earlier bare
# `predict(lin_mod1)` call was removed: its result was overwritten by the very
# next assignment, so it was a dead store.
preds_train <- predict(lin_mod1, newdata = songs_train)
preds_test <- predict(lin_mod1, newdata = songs_test)

# Calculating RMSE in the test and training sets

#' Root mean squared error between observed and predicted values.
#'
#' @param true Numeric vector of observed values.
#' @param predictions Numeric vector of predicted values. Must have the same
#'   length as `true`, unless one of the two has length 1 (constant baseline).
#' @param na.rm If `TRUE`, observations whose squared error is `NA` are
#'   ignored. Defaults to `FALSE`, which reproduces the previous behaviour
#'   (any `NA` makes the result `NA`).
#' @return A single numeric value, `sqrt(mean((true - predictions)^2))`.
get_rmse <- function(true, predictions, na.rm = FALSE) {
    # Guard against silent recycling when the two vectors are misaligned.
    stopifnot(
        is.numeric(true),
        is.numeric(predictions),
        length(true) == length(predictions) ||
            length(true) == 1L || length(predictions) == 1L
    )
    sqrt(mean((true - predictions)^2, na.rm = na.rm))
}
# RMSE of the current predictions against the observed Likes in songs_train.
get_rmse(true = songs_train$Likes, predictions = preds_train)
## [1] 272678.9
# RMSE of the current predictions against the observed Likes in songs_test.
get_rmse(true = songs_test$Likes, predictions = preds_test)
## [1] 278304

Version 2 - removing channel_factor as a variable did not improve the model; instead, it lowered the Adjusted R-squared value (from 0.09493 to 0.07275)


# Version 2: same specification as Version 1 but without channel_factor.
lin_mod2 <- lm(
    formula = Likes ~ Danceability + Duration_ms + Licensed + Album_type +
        Instrumentalness,
    data = songs_train
)

# Print the coefficient table, residual quantiles, and fit statistics.
summary(lin_mod2)
## 
## Call:
## lm(formula = Likes ~ Danceability + Duration_ms + Licensed + 
##     Album_type + Instrumentalness, data = songs_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -480494 -176616  -97802   75078 1183211 
## 
## Coefficients:
##                            Estimate    Std. Error t value
## (Intercept)             31589.83073   11849.11819   2.666
## Danceability           137233.51423   15857.78031   8.654
## Duration_ms                 0.05585       0.01725   3.237
## LicensedTrue           140892.38337    5228.52666  26.947
## Album_typecompilation   -5677.87940   12923.90664  -0.439
## Album_typesingle         1943.83779    5884.33464   0.330
## Instrumentalness      -125658.16863   12988.41266  -9.675
##                                   Pr(>|t|)    
## (Intercept)                        0.00769 ** 
## Danceability          < 0.0000000000000002 ***
## Duration_ms                        0.00121 ** 
## LicensedTrue          < 0.0000000000000002 ***
## Album_typecompilation              0.66043    
## Album_typesingle                   0.74115    
## Instrumentalness      < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 276200 on 12776 degrees of freedom
## Multiple R-squared:  0.07319,    Adjusted R-squared:  0.07275 
## F-statistic: 168.1 on 6 and 12776 DF,  p-value: < 0.00000000000000022

# sjPlot's default plot of the Version 2 model estimates.
plot_model(lin_mod2)


# Generating predictions for our model. The bare `predict(lin_mod2)` call was
# removed because its result was immediately overwritten below.
preds_train <- predict(lin_mod2, newdata = songs_train)
preds_test <- predict(lin_mod2, newdata = songs_test)

# Calculating RMSE in the test and training sets.
# `get_rmse()` is already defined in the Version 1 section, so the identical
# redefinition that used to sit here was dropped.
get_rmse(songs_train$Likes, preds_train)
## [1] 276107.5
get_rmse(songs_test$Likes, preds_test)
## [1] 280038.1

Version 3 Semifinal Model


# Version 3: swaps in official_video, Valence, Loudness, and Liveness and
# brings channel_factor back, still fit on songs_train.
lin_mod3 <- lm(
    formula = Likes ~ Danceability + Licensed + official_video + Valence +
        channel_factor + Loudness + Liveness,
    data = songs_train
)

# Print the coefficient table, residual quantiles, and fit statistics.
summary(lin_mod3)
## 
## Call:
## lm(formula = Likes ~ Danceability + Licensed + official_video + 
##     Valence + channel_factor + Loudness + Liveness, data = songs_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -671382 -180058  -85165   73167 1227352 
## 
## Coefficients:
##                                  Estimate Std. Error t value
## (Intercept)                       -2374.1    65061.7  -0.036
## Danceability                     137307.0    17062.0   8.048
## LicensedTrue                      50787.1     8945.0   5.678
## official_videoTrue               103223.3     9772.6  10.563
## Valence                          -61546.8    11214.7  -5.488
## channel_factorDisneyMusicVEVO    296863.5    82882.3   3.582
## channel_factorRHINO               24190.8    77089.1   0.314
## channel_factorSMTOWN             629730.5    98373.7   6.401
## channel_factorSony Music India   350448.0    85059.6   4.120
## channel_factorSonyMusicIndiaVEVO 331594.0    73104.1   4.536
## channel_factorSonyMusicSouthVEVO 123329.7    72943.7   1.691
## channel_factorT-Series           403857.3    69149.2   5.840
## channel_factorYRF                545792.2    83603.1   6.528
## channel_factorZee Music Company  415664.3    79196.5   5.249
## channel_factorOther              115254.5    63780.5   1.807
## Loudness                           7821.7      565.9  13.821
## Liveness                         -26734.7    14482.8  -1.846
##                                              Pr(>|t|)    
## (Intercept)                                  0.970892    
## Danceability                     0.000000000000000919 ***
## LicensedTrue                     0.000000013950030670 ***
## official_videoTrue               < 0.0000000000000002 ***
## Valence                          0.000000041418305336 ***
## channel_factorDisneyMusicVEVO                0.000343 ***
## channel_factorRHINO                          0.753676    
## channel_factorSMTOWN             0.000000000159335363 ***
## channel_factorSony Music India   0.000038121558777772 ***
## channel_factorSonyMusicIndiaVEVO 0.000005787725596645 ***
## channel_factorSonyMusicSouthVEVO             0.090908 .  
## channel_factorT-Series           0.000000005335163418 ***
## channel_factorYRF                0.000000000069004181 ***
## channel_factorZee Music Company  0.000000155784643841 ***
## channel_factorOther                          0.070778 .  
## Loudness                         < 0.0000000000000002 ***
## Liveness                                     0.064921 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 270200 on 12766 degrees of freedom
## Multiple R-squared:  0.1134, Adjusted R-squared:  0.1123 
## F-statistic: 102.1 on 16 and 12766 DF,  p-value: < 0.00000000000000022

# Tabulating the coefficient estimates with confidence intervals and p-values
tab_model(lin_mod3)
  Likes
Predictors Estimates CI p
(Intercept) -2374.12 -129904.75 – 125156.50 0.971
Danceability 137306.95 103862.84 – 170751.07 <0.001
Licensed [True] 50787.06 33253.48 – 68320.64 <0.001
official video [True] 103223.26 84067.55 – 122378.97 <0.001
Valence -61546.82 -83529.37 – -39564.27 <0.001
channel factor
[DisneyMusicVEVO]
296863.50 134401.73 – 459325.27 <0.001
channel factor [RHINO] 24190.80 -126915.37 – 175296.96 0.754
channel factor [SMTOWN] 629730.51 436903.32 – 822557.71 <0.001
channel factor [Sony
Music India]
350447.95 183718.40 – 517177.50 <0.001
channel factor
[SonyMusicIndiaVEVO]
331594.02 188298.98 – 474889.06 <0.001
channel factor
[SonyMusicSouthVEVO]
123329.74 -19650.81 – 266310.30 0.091
channel factor [T-Series] 403857.29 268314.51 – 539400.07 <0.001
channel factor [YRF] 545792.18 381917.58 – 709666.79 <0.001
channel factor [Zee Music
Company]
415664.34 260427.33 – 570901.36 <0.001
channel factor [Other] 115254.47 -9764.90 – 240273.84 0.071
Loudness 7821.72 6712.44 – 8931.00 <0.001
Liveness -26734.69 -55123.17 – 1653.79 0.065
Observations 12783
R2 / R2 adjusted 0.113 / 0.112

# sjPlot's default plot of the Version 3 model estimates.
plot_model(lin_mod3)


# Per-term summary of the fitted model as a data frame. tidy() is not defined
# in this section; presumably it is broom's tidier attached via tidymodels.
tidy(lin_mod3)

Version 4


# Version 4: the Version 3 specification with channel_factor removed.
lin_mod4 <- lm(
    formula = Likes ~ Danceability + Licensed + official_video + Valence +
        Loudness + Liveness,
    data = songs_train
)

# Print the coefficient table, residual quantiles, and fit statistics.
summary(lin_mod4)
## 
## Call:
## lm(formula = Likes ~ Danceability + Licensed + official_video + 
##     Valence + Loudness + Liveness, data = songs_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -339295 -185833  -86709   73430 1230271 
## 
## Coefficients:
##                    Estimate Std. Error t value             Pr(>|t|)
## (Intercept)        116774.8    14138.8   8.259 < 0.0000000000000002
## Danceability       134917.0    17263.7   7.815  0.00000000000000592
## LicensedTrue        59358.1     9034.5   6.570  0.00000000005221201
## official_videoTrue 102896.3     9894.3  10.400 < 0.0000000000000002
## Valence            -60256.8    11337.6  -5.315  0.00000010857173525
## Loudness             8055.1      572.3  14.075 < 0.0000000000000002
## Liveness           -32176.1    14649.0  -2.196               0.0281
##                       
## (Intercept)        ***
## Danceability       ***
## LicensedTrue       ***
## official_videoTrue ***
## Valence            ***
## Loudness           ***
## Liveness           *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 273600 on 12776 degrees of freedom
## Multiple R-squared:  0.09042,    Adjusted R-squared:  0.08999 
## F-statistic: 211.7 on 6 and 12776 DF,  p-value: < 0.00000000000000022

# sjPlot's default plot of the Version 4 model estimates.
plot_model(lin_mod4)


# Generating predictions for our model. The bare `predict(lin_mod4)` call was
# removed because its result was immediately overwritten below.
preds_train <- predict(lin_mod4, newdata = songs_train)
preds_test <- predict(lin_mod4, newdata = songs_test)

# Calculating RMSE in the test and training sets.
# `get_rmse()` is already defined in the Version 1 section, so the identical
# redefinition that used to sit here was dropped.
get_rmse(songs_train$Likes, preds_train)
## [1] 273528.1
get_rmse(songs_test$Likes, preds_test)
## [1] 276688

# Regenerating predictions from lin_mod3 so that `preds_train` and
# `preds_test`, which the code further down reuses, come from Version 3
# rather than from the Version 4 model evaluated just above.
# The bare `predict(lin_mod3)` call was removed: it was overwritten at once.
preds_train <- predict(lin_mod3, newdata = songs_train)
preds_test <- predict(lin_mod3, newdata = songs_test)

# Calculating RMSE in the test and training sets, reusing the `get_rmse()`
# helper defined in the Version 1 section instead of redefining it here.
get_rmse(songs_train$Likes, preds_train)
## [1] 270048.8
get_rmse(songs_test$Likes, preds_test)
## [1] 274708.6

# Pair each prediction with the observed Likes and tag the rows with the data
# set they came from, then stack both sets into one table for plotting.
results_train <- tibble(
    preds = preds_train,
    true = songs_train$Likes,
    type = "train"
)
results_test <- tibble(
    preds = preds_test,
    true = songs_test$Likes,
    type = "test"
)
results_df <- bind_rows(results_train, results_test)

# Predicted vs. true Likes, one panel per data set; the purple line is the
# default geom_abline() (slope 1, intercept 0), i.e. perfect prediction.
ggplot(data = results_df, mapping = aes(x = true, y = preds)) +
    geom_point(aes(color = type), alpha = 1/10) +
    geom_abline(color = "purple") +
    facet_wrap(~type) +
    xlim(0, 2500000) +
    ylim(0, 2500000) +
    theme_clean(base_size = 8) +
    theme(legend.position = "bottom")


# Calculate the median absolute error (MedAE) and the mean absolute error
# (MAE) in the test and training sets

#' Median absolute error between observed and predicted values.
#'
#' @param true Numeric vector of observed values.
#' @param predictions Numeric vector of predicted values. Must have the same
#'   length as `true`, unless one of the two has length 1 (constant baseline).
#' @param na.rm If `TRUE`, observations whose absolute error is `NA` are
#'   ignored. Defaults to `FALSE`, which reproduces the previous behaviour.
#' @return A single numeric value, `median(abs(true - predictions))`.
get_medae <- function(true, predictions, na.rm = FALSE) {
    # Guard against silent recycling when the two vectors are misaligned.
    stopifnot(
        is.numeric(true),
        is.numeric(predictions),
        length(true) == length(predictions) ||
            length(true) == 1L || length(predictions) == 1L
    )
    median(abs(true - predictions), na.rm = na.rm)
}

# Median absolute error on the test rows of the results table.
get_medae(true = results_test$true, predictions = results_test$preds)
## [1] 154525.3
# Median absolute error on the train rows of the results table.
get_medae(true = results_train$true, predictions = results_train$preds)
## [1] 154287.3

#' Mean absolute error between observed and predicted values.
#'
#' @param true Numeric vector of observed values.
#' @param predictions Numeric vector of predicted values. Must have the same
#'   length as `true`, unless one of the two has length 1 (constant baseline).
#' @param na.rm If `TRUE`, observations whose absolute error is `NA` are
#'   ignored. Defaults to `FALSE`, which reproduces the previous behaviour.
#' @return A single numeric value, `mean(abs(true - predictions))`.
get_mae <- function(true, predictions, na.rm = FALSE) {
    # Guard against silent recycling when the two vectors are misaligned.
    stopifnot(
        is.numeric(true),
        is.numeric(predictions),
        length(true) == length(predictions) ||
            length(true) == 1L || length(predictions) == 1L
    )
    mean(abs(true - predictions), na.rm = na.rm)
}

# Mean absolute error on the test rows of the results table.
get_mae(true = results_test$true, predictions = results_test$preds)
## [1] 199940.9
# Mean absolute error on the train rows of the results table.
get_mae(true = results_train$true, predictions = results_train$preds)
## [1] 197787.5

Miscellaneous

Lasso Regression

Used to identify which variables to include in the model.


# Lasso (alpha = 1) with a cross-validated penalty, used as a variable screen
# for Likes. Every column left in songs_lasso other than Likes becomes a
# candidate predictor through the `Likes ~ .` formula.
#
# `log_likes` is now dropped along with the other excluded columns: it is
# presumably a log transform of the response Likes (confirm where it is
# created), and leaving it in lets the model recover the target from itself,
# which distorts every other coefficient in the screen.
#
# NOTE(review): `popular` may also be derived from an engagement count;
# verify before trusting its coefficient. The fit uses songs_clean rather than
# songs_train, and cv.glmnet() assigns folds at random, so lambda.min can
# differ between runs unless a seed is set beforehand; confirm both are
# intended.
# The coefficient table printed below predates this change; re-knit to
# refresh it.
songs_lasso <- songs_clean %>%
    select(-Artist, -Channel, -Track, -Album, -Title, -Stream, -Comments,
        -Views, -log_likes)

lasso_fit1 <- cv.glmnet(Likes ~ ., data = songs_lasso, alpha = 1)

# Coefficients at the lambda with the lowest cross-validated error; a "."
# in the sparse-matrix printout marks a coefficient shrunk exactly to zero.
print(coef(lasso_fit1, s = "lambda.min"))
## 34 x 1 sparse Matrix of class "dgCMatrix"
##                                                     s1
## (Intercept)                      -732312.4826031260891
## Album_typealbum                   -14961.2220563282881
## Album_typecompilation                  .              
## Album_typesingle                    1327.6253007368950
## Danceability                      235220.9792014449195
## Energy                              9526.1836990412248
## Key                                  358.6198671223917
## Loudness                           -1386.4072336047909
## Speechiness                       138551.5907292820921
## Acousticness                       18180.6517090925699
## Instrumentalness                   30992.5007054814087
## Liveness                           17926.2984514284108
## Valence                           -14693.7927158808579
## Tempo                                177.2259294922730
## Duration_ms                            0.0012397169376
## LicensedFalse                     -22794.3923026303673
## LicensedTrue                           0.0000004354782
## official_videoFalse                22923.0175054061292
## official_videoTrue                    -0.0000004836577
## popular0                         -155050.8781290832849
## popular1                               .              
## channel_factorAtlantic Records   -121694.3576162397803
## channel_factorDisneyMusicVEVO          .              
## channel_factorRHINO              -123071.1359877009527
## channel_factorSMTOWN              355735.8565067854943
## channel_factorSony Music India    113654.8131740860845
## channel_factorSonyMusicIndiaVEVO  101538.4225713686028
## channel_factorSonyMusicSouthVEVO  -56537.8223991831910
## channel_factorT-Series            145847.2418499816558
## channel_factorYRF                 241761.2599059541535
## channel_factorZee Music Company   147761.9635412363859
## channel_factorOther               -58058.1173608884637
## log_likes                          80571.1806766767550
## log_dance                         -96064.3879771835927